The goal of this notebook is to compare pseudobulk and bulk
calculations to determine which pseudobulk calculation should we proceed
with for modeling: the log2 of the sum of raw counts
(pseudobulk_log_counts) or the
DESeq2-normalized sum of raw counts
(pseudobulk_deseq). To this end, we’ll explore pseudobulk
expression distributions, compare them to bulk, and also explore
distributions of expression where there is disagreement between bulk and
single-cell. We also compare to bulk counts to see which quantity more
closely approximates those values.
renv::load()
library(ggplot2)
theme_set(theme_bw())
data_dir <- here::here("analysis", "pseudobulk-bulk-prediction", "data")
tpm_dir <- file.path(data_dir, "tpm")
pseudobulk_dir <- file.path(data_dir, "pseudobulk")
bulk_counts_file <- file.path(data_dir, "scpca_data", "normalized_bulk_counts.rds")
tpm_files <- list.files(
path = tpm_dir,
full.names = TRUE,
pattern = "-tpm\\.tsv$"
)
tpm_names <- stringr::str_split_i(basename(tpm_files), pattern = "-", i = 1)
names(tpm_files) <- tpm_names
pseudobulk_files <- list.files(
path = pseudobulk_dir,
full.names = TRUE,
pattern = "-pseudobulk\\.tsv$"
)
pseudobulk_names <- stringr::str_split_i(basename(pseudobulk_files), pattern = "-", i = 1)
names(pseudobulk_files) <- pseudobulk_names
# Make sure we have the same projects, in the same order
stopifnot(
all.equal(names(tpm_files), names(pseudobulk_files))
)
We’ll make both a long and wide version of the data for convenience throughout the notebook.
project_long_df_list <- purrr::map2(
tpm_files,
pseudobulk_files,
\(tpm_file, pseudo_file) {
dplyr::bind_rows(
# TPM needs to be in log2 space
readr::read_tsv(tpm_file, show_col_types = FALSE) |>
dplyr::mutate(expression = log2(expression)),
readr::read_tsv(pseudo_file, show_col_types = FALSE)
)
}
)
# Make a wide version as well
project_wide_df_list <- project_long_df_list |>
purrr::map(
\(df) {
df |>
tidyr::pivot_wider(names_from = expression_type, values_from = expression)
}
)
We’ll read in the bulk counts data as well which have been normalized with DESeq2.
# we only want to keep these samples from the bulk counts
present_samples <- project_wide_df_list |>
purrr::map(
\(df) {
df |>
dplyr::pull(sample_id) |>
unique()
}
) |>
purrr::reduce(c)
# Make a list of data frames of bulk counts, normalized by DESeq2
bulk_counts_wide_df_list <- readr::read_rds(bulk_counts_file) |>
purrr::map(
\(df) {
df |>
dplyr::filter(sample_id %in% present_samples) |>
dplyr::rename(ensembl_id = gene_id)
}
)
Bind it up for later comparisons:
project_wide_df_list <- purrr::map2(
bulk_counts_wide_df_list,
project_wide_df_list,
\(df_counts, df_main) {
df_main |>
dplyr::left_join(df_counts, by = c("ensembl_id", "sample_id"))
}
)
First, we’ll visualize distributions of all quantities:
ggplot(purrr::list_rbind(project_long_df_list, names_to = "project_id")) +
aes(x = expression, fill = expression_type) +
geom_density(alpha = 0.5) +
scale_fill_brewer(palette = "Dark2") +
facet_grid(
rows = vars(expression_type),
cols = vars(project_id),
scales = "free_y"
) +
theme(legend.position = "none")
We see big spikes at zero for pseudobulk, not surprisingly. Due to
the different transformation approaches, the
pseudobulk_deseq version has some negatives for fractional
values, but the other quantities have a lower bound of zero. All around,
distributions range from their lower bound up to around 20, so it’s nice
to know pseudobulk and bulk are definitely on the same scale.
This section will look at the relationship among quantities.
In this section, we’ll look at some scatterplots:
y=x, and the blue line is the regression line.project_wide_df_list |>
purrr::imap(
\(df, project_id) {
ggplot(df) +
aes(x = pseudobulk_deseq, y = pseudobulk_log_counts) +
geom_point(alpha = 0.2, size = 0.5) +
geom_smooth(method = "lm", linewidth = 0.5) +
geom_abline(linewidth = 0.5, color = "red") +
facet_wrap(vars(sample_id), nrow = 5) +
ggtitle(project_id)
}
) |>
patchwork::wrap_plots(ncol = 1)
These quantities are exceptionally similar with these differences:
pseudobulk_log_counts appears to have a higher proportion
of low to zero counts, and throughout has lower values than
pseudobulk_deseq.We’d like to also get a sense of how this data compares to counts directly.
# Helper function to visualize scatterplots with geom_bin_2d()
make_binned_scatterplots <- function(df, project_id, yvar, nbins, facet_rows) {
p1 <- ggplot(df) +
aes(x = pseudobulk_deseq, y = {{yvar}}) +
geom_bin_2d(bins = nbins) +
geom_smooth(method = "lm", alpha = 0.8, linewidth = 0.5) +
geom_abline(alpha = 0.8, linewidth = 0.5, color = "red") +
facet_wrap(vars(sample_id), nrow = facet_rows) +
ggtitle("~ deseq")
p2 <- ggplot(df) +
aes(x = pseudobulk_log_counts, y = {{yvar}}) +
geom_bin_2d(bins = nbins) +
geom_smooth(method = "lm", alpha = 0.8, linewidth = 0.5) +
geom_abline(alpha = 0.8, linewidth = 0.5, color = "red") +
facet_wrap(vars(sample_id), nrow = facet_rows) +
ggtitle("~ log_counts")
print(
patchwork::wrap_plots(p1, p2, ncol = 2) + patchwork::plot_annotation(title = project_id)
)
}
make_binned_scatterplots(
project_wide_df_list$SCPCP000001,
project_id = "SCPCP000001",
yvar = bulk_counts,
nbins = 15,
facet_rows = 6
)
make_binned_scatterplots(
project_wide_df_list$SCPCP000002,
project_id = "SCPCP000002",
yvar = bulk_counts,
nbins = 15,
facet_rows = 6
)
make_binned_scatterplots(
project_wide_df_list$SCPCP000006,
project_id = "SCPCP000006",
yvar = bulk_counts,
nbins = 15,
facet_rows = 9
)
make_binned_scatterplots(
project_wide_df_list$SCPCP000009,
project_id = "SCPCP000009",
yvar = bulk_counts,
nbins = 15,
facet_rows = 1
)
make_binned_scatterplots(
project_wide_df_list$SCPCP000017,
project_id = "SCPCP000017",
yvar = bulk_counts,
nbins = 40,
facet_rows = 7
)
Let’s now get some stats for the comparison between bulk counts and pseudobulk. We’ll fit a linear model for each sample, and display some quantities below both as boxplots and the full table.
# Helper functions to plot model statistics from data frame
plot_stats <- function(df, column, title) {
ggplot(df) +
aes(x = expression_type, y = {{column}}, color = expression_type) +
geom_boxplot() +
scale_color_brewer(palette = "Dark2") +
ggtitle(title) +
facet_wrap(vars(project_id), nrow = 1) +
theme(legend.position = "none")
}
model_samples_counts <- function(id, df) {
sample_df <- df |>
dplyr::filter(sample_id == id)
df_deseq <- sample_df |>
dplyr::filter(is.finite(pseudobulk_deseq), is.finite(bulk_counts))
fit_deseq <- summary(lm(bulk_counts ~ pseudobulk_deseq, data = df_deseq))
df_log_counts <- sample_df |>
dplyr::filter(is.finite(pseudobulk_log_counts), is.finite(bulk_counts))
fit_log_counts <- summary(lm(bulk_counts ~ pseudobulk_log_counts, data = df_log_counts))
# Tabulate and return some fit stats
data.frame(
expression_type = c("deseq", "log_counts"),
rsquared = c(fit_deseq$r.squared, fit_log_counts$r.squared),
coeff = c(fit_deseq$coefficients[2], fit_log_counts$coefficients[2]),
residual_sd = c(fit_deseq$sigma, fit_log_counts$sigma)
)
}
stats_df <- project_wide_df_list |>
purrr::map(
\(df) {
# We need to map over sample ids now
samples <- unique(df$sample_id)
names(samples) <- samples
fit_table <- samples |>
purrr::map(model_samples_counts, df) |>
purrr::list_rbind(names_to = "sample_id")
return(fit_table)
}
) |>
# now, combine all projects into a single table
purrr::list_rbind(names_to = "project_id")
patchwork::wrap_plots(
plot_stats(stats_df, rsquared, "rsquared"),
plot_stats(stats_df, coeff, "coeff"),
plot_stats(stats_df, residual_sd, "residual_sd"),
nrow = 3
)
SCPCP000001 and SCPCP000002 have some
outlying samples where log_counts performs worse, which are
likely the cases we see in the scatterplots where the relationship with
log_counts shows bias relative to y=xSCPCP000001 and
SCPCP000002, then SCPCP000006, then
SCPCP000009, and finally SCPCP000017 whose
relationship is weak if at all present.SCPCP000017
does have more variation here, but also the relationship is very weak in
the first place so these different coefficients are not necessarily
statistically significantly different.All the actual values are here:
stats_df
Now we’ll do the same for pseudobulk and bulk TPM
make_binned_scatterplots(
project_wide_df_list$SCPCP000001,
project_id = "SCPCP000001",
yvar = bulk_tpm,
nbins = 40,
facet_rows = 6
)
make_binned_scatterplots(
project_wide_df_list$SCPCP000002,
project_id = "SCPCP000002",
yvar = bulk_tpm,
nbins = 30,
facet_rows = 6
)
make_binned_scatterplots(
project_wide_df_list$SCPCP000006,
project_id = "SCPCP000006",
yvar = bulk_tpm,
nbins = 50,
facet_rows = 9
)
make_binned_scatterplots(
project_wide_df_list$SCPCP000009,
project_id = "SCPCP000009",
yvar = bulk_tpm,
nbins = 15,
facet_rows = 1
)
make_binned_scatterplots(
project_wide_df_list$SCPCP000017,
project_id = "SCPCP000017",
yvar = bulk_tpm,
nbins = 40,
facet_rows = 7
)
We see high concentrations of points around (0,0) as
well as towards the middle values across plots. Next, we’ll look at
regression stats for these plots directly.
Let’s now get some stats for the comparison between bulk TPM and pseudobulk. We’ll fit a linear model for each sample, and display some quantities below both as boxplots and the full table.
model_samples_tpm <- function(id, df) {
sample_df <- df |>
dplyr::filter(sample_id == id)
df_deseq <- sample_df |>
dplyr::filter(is.finite(pseudobulk_deseq), is.finite(bulk_tpm))
fit_deseq <- summary(lm(bulk_tpm ~ pseudobulk_deseq, data = df_deseq))
df_log_counts <- sample_df |>
dplyr::filter(is.finite(pseudobulk_log_counts), is.finite(bulk_tpm))
fit_log_counts <- summary(lm(bulk_tpm ~ pseudobulk_log_counts, data = df_log_counts))
# Tabulate and return some fit stats
data.frame(
expression_type = c("deseq", "log_counts"),
rsquared = c(fit_deseq$r.squared, fit_log_counts$r.squared),
coeff = c(fit_deseq$coefficients[2], fit_log_counts$coefficients[2]),
residual_sd = c(fit_deseq$sigma, fit_log_counts$sigma)
)
}
stats_df <- project_wide_df_list |>
purrr::map(
\(df) {
# We need to map over sample ids now
samples <- unique(df$sample_id)
names(samples) <- samples
fit_table <- samples |>
purrr::map(model_samples_tpm, df) |>
purrr::list_rbind(names_to = "sample_id")
return(fit_table)
}
) |>
# now, combine all projects into a single table
purrr::list_rbind(names_to = "project_id")
patchwork::wrap_plots(
plot_stats(stats_df, rsquared, "rsquared"),
plot_stats(stats_df, coeff, "coeff"),
plot_stats(stats_df, residual_sd, "residual_sd"),
nrow = 3
)
SCPCP000001 and
SCPCP000002, then SCPCP000006, then
SCPCP000009, and finally SCPCP000017 whose
relationship is weak if at all present.SCPCP000017
does have more variation here, but also the relationship is very weak in
the first place so these different coefficients are not necessarily
statistically significantly different.All the actual values are here:
stats_df
Currently this section does not include bulk counts, only bulk TPM.
Next, we’ll take a quick look at cases where one modality has zero expression and the other doesn’t. In these cases, if expression is generally high, we have evidence of disagreement/discrepancy between bulk and single-cell that may be interesting to investigate. In this notebook, we’ll just a sense of how much “there is there,” and we’ll leave the in-depth look into any such genes for a subsequent notebook.
In this section, we’ll also use a threshold of 1e-12 for
zero here.
project_wide_df_list |>
purrr::map(
\(df) {
low_deseq <- df |>
dplyr::filter(pseudobulk_deseq <= 1e-12,
bulk_tpm > 1e-12)
low_logcounts <- df |>
dplyr::filter(pseudobulk_log_counts <= 1e-12,
bulk_tpm > 1e-12)
p1 <- ggplot(low_deseq) +
aes(x = sample_id, y = bulk_tpm) +
ggforce::geom_sina(size = 0.5) +
theme(axis.text.x = element_blank()) +
ggtitle("TPM for zero & negative pseudobulk_deseq")
p2 <- ggplot(low_logcounts) +
aes(x = sample_id, y = bulk_tpm) +
ggforce::geom_sina(size = 0.5) +
theme(axis.text.x = element_blank()) +
ggtitle("TPM for zero pseudobulk_log_counts")
patchwork::wrap_plots(p1, p2, nrow = 1)
}
) |>
patchwork::wrap_plots(ncol = 1)
project_wide_df_list |>
purrr::imap(
\(df, project_id) {
low_bulk <- df |>
# Even though we don't have 1:1 correspondence between pseudobulks here,
# we'll just consider only points where neither is 0 to get a sense.
dplyr::filter(bulk_tpm <= 1e-12,
pseudobulk_deseq> 1e-12,
pseudobulk_log_counts> 1e-12) |>
tidyr::pivot_longer(
contains("pseudobulk"),
names_to = "expression_type",
values_to = "expression"
)
ggplot(low_bulk) +
aes(x = sample_id, y = expression, color = expression_type) +
ggforce::geom_sina(size = 0.5) +
theme(axis.text.x = element_blank()) +
ggtitle(project_id)
}
) |>
patchwork::wrap_plots(ncol = 1, guides = "collect")
From both comparisons, there are a fair number of genes with high expression in one modality and essentially zero in the other. A more careful investigation here look into what exactly these genes are, and whether they have some biological relationship that might suggest modalities are picking up different information.
This will show genes strongly affected by different preparation/normalization approaches.
project_wide_df_list |>
purrr::imap(
\(df, project_id) {
different_low_pseudo <- df |>
dplyr::filter((pseudobulk_deseq> 1e-12 & pseudobulk_log_counts <= 1e-12) |
(pseudobulk_deseq <=1e-12 & pseudobulk_log_counts > 1e-12) ) |>
tidyr::pivot_longer(
contains("pseudobulk"),
names_to = "expression_type",
values_to = "expression"
)
ggplot(different_low_pseudo) +
aes(x = sample_id, y = expression, color = expression_type) +
ggforce::geom_sina(size = 0.5) +
theme(axis.text.x = element_blank()) +
ggtitle(project_id)
}
) |>
patchwork::wrap_plots(ncol = 1, guides = "collect")
Most values seem to be [-2.5, 2.5] for both pseudobulks,
but some are getting as high as 6-9.
sessionInfo()
R version 4.4.0 (2024-04-24)
Platform: aarch64-apple-darwin20
Running under: macOS 15.3
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
time zone: America/New_York
tzcode source: internal
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] ggplot2_3.5.1
loaded via a namespace (and not attached):
[1] sass_0.4.9 generics_0.1.3 tidyr_1.3.1
[4] renv_1.0.11 stringi_1.8.4 lattice_0.22-6
[7] hms_1.1.3 digest_0.6.37 magrittr_2.0.3
[10] evaluate_1.0.1 grid_4.4.0 RColorBrewer_1.1-3
[13] fastmap_1.2.0 Matrix_1.7-1 rprojroot_2.0.4
[16] jsonlite_1.8.9 BiocManager_1.30.25 mgcv_1.9-1
[19] purrr_1.0.2 scales_1.3.0 tweenr_2.0.3
[22] jquerylib_0.1.4 cli_3.6.3 rlang_1.1.4
[25] crayon_1.5.3 polyclip_1.10-7 bit64_4.5.2
[28] munsell_0.5.1 splines_4.4.0 withr_3.0.2
[31] cachem_1.1.0 yaml_2.3.10 tools_4.4.0
[34] parallel_4.4.0 tzdb_0.4.0 dplyr_1.1.4
[37] colorspace_2.1-1 here_1.0.1 vctrs_0.6.5
[40] R6_2.5.1 lifecycle_1.0.4 stringr_1.5.1
[43] bit_4.5.0.1 MASS_7.3-64 vroom_1.6.5
[46] pkgconfig_2.0.3 pillar_1.10.0 bslib_0.8.0
[49] gtable_0.3.6 Rcpp_1.0.13-1 glue_1.8.0
[52] ggforce_0.4.2 xfun_0.49 tibble_3.2.1
[55] tidyselect_1.2.1 knitr_1.49 farver_2.1.2
[58] htmltools_0.5.8.1 nlme_3.1-166 patchwork_1.3.0
[61] rmarkdown_2.29 labeling_0.4.3 readr_2.1.5
[64] compiler_4.4.0